In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(context="notebook", style="white")

import numpy as np
import pandas as pd
import scipy.io as sio

import sys
sys.path.append('..')

from helper import general
from helper import pca

## Load data


In [6]:
# Load the example data set for the PCA exercise.
mat = sio.loadmat('./data/ex7data1.mat')
X = mat['X']  # index, don't .get(): fail loudly if the key is missing instead of carrying None forward

# visualize raw data
print(X.shape)

# seaborn >= 0.11 requires x/y to be passed as keyword arguments
sns.lmplot(x='X1', y='X2',
           data=pd.DataFrame(X, columns=['X1', 'X2']),
           fit_reg=False)


(50, 2)
Out[6]:
<seaborn.axisgrid.FacetGrid at 0x11264a748>

## Normalize data


In [3]:
# Feature-normalize before PCA so every dimension contributes on the same scale.
X_norm = pca.normalize(X)

# seaborn >= 0.11 requires x/y to be passed as keyword arguments
sns.lmplot(x='X1', y='X2',
           data=pd.DataFrame(X_norm, columns=['X1', 'X2']),
           fit_reg=False)


Out[3]:
<seaborn.axisgrid.FacetGrid at 0x106c79668>

## Covariance matrix $\Sigma$

This is the *biased* sample covariance matrix; for the unbiased version, divide by $m-1$ instead of $m$.


In [4]:
# Covariance of the normalized data; diagonal is 1 because each feature was standardized.
Sigma = pca.covariance_matrix(X_norm)  # capital greek Sigma
Sigma  # (n, n)


Out[4]:
array([[ 1.        ,  0.73553038],
       [ 0.73553038,  1.        ]])

In [12]:
U, S, V = pca.pca(X_norm)

In [13]:
U


Out[13]:
array([[-0.70710678, -0.70710678],
       [-0.70710678,  0.70710678]])

In [7]:
# First principal component: the leading *column* of U — np.linalg.svd returns
# the directions as columns, so U[:, 0] is the correct convention. The original
# U[0] (first row) only gave the same vector because this particular U happens
# to be symmetric (see Out[13]).
u1 = U[:, 0]
u1


Out[7]:
array([-0.70710678, -0.70710678])

## Project data to a lower dimension


In [8]:
# show top 10 projected data
Z = pca.project_data(X_norm, U, 1)
Z[:10]


Out[8]:
array([[ 1.49631261],
       [-0.92218067],
       [ 1.22439232],
       [ 1.64386173],
       [ 1.2732206 ],
       [-0.97681976],
       [ 1.26881187],
       [-2.34148278],
       [-0.02999141],
       [-0.78171789]])

In [16]:
# Side by side: the original 2-D (normalized) data and its 1-D projection Z.
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12, 4))

# seaborn >= 0.11 requires x/y to be passed as keyword arguments
sns.regplot(x='X1', y='X2',
            data=pd.DataFrame(X_norm, columns=['X1', 'X2']),
            fit_reg=False,
            ax=ax1)
ax1.set_title('Original dimension')

# Z is (m, 1) (see Out[8]); flatten to the 1-D vector rugplot expects
sns.rugplot(x=Z[:, 0], ax=ax2)
ax2.set_xlabel('Z')
ax2.set_title('Z dimension')


Out[16]:
<matplotlib.text.Text at 0x1163d08d0>

## Recover data to the original dimension

Of course, there is inevitable information loss when you map data from the lower dimension back up to the original, higher dimension.


In [17]:
# Map the 1-D projection back into 2-D and compare against the original data.
X_recover = pca.recover_data(Z, U)

fig, (ax1, ax2, ax3) = plt.subplots(ncols=3, figsize=(12, 4))

# Z is (m, 1); flatten to the 1-D vector rugplot expects
sns.rugplot(x=Z[:, 0], ax=ax1)
ax1.set_title('Z dimension')
ax1.set_xlabel('Z')

# seaborn >= 0.11 requires x/y to be passed as keyword arguments
sns.regplot(x='X1', y='X2',
            data=pd.DataFrame(X_recover, columns=['X1', 'X2']),
            fit_reg=False,
            ax=ax2)
ax2.set_title("2D projection from Z")

sns.regplot(x='X1', y='X2',
            data=pd.DataFrame(X_norm, columns=['X1', 'X2']),
            fit_reg=False,
            ax=ax3)
ax3.set_title('Original dimension')


Out[17]:
<matplotlib.text.Text at 0x1166f0e80>

The projection from $(X_1, X_2)$ to $Z$ can be visualized like this:


In [ ]: